Using PLAZA ortholog groups from PlantGenIE inferred using OrthoMCL: ftp://plantgenie.org/Data/Cross-Species/Orthologs/PLAZA/orthologs.ORTHO.plantgenie.txt.gz
Species names: from PLAZA e.g. “potri” and “piabi”
Expression files: tab-separated file where the first column contains gene names and is called “Genes”. Here we use data from AspWood and NorWood.
# Species name in Plaza
species1_name <- "potri"
species2_name <- "piabi"
# Expression files
species1_expr_file <- "Data/AspWood_expression.txt"
species2_expr_file <- "Data/NorWood_expression.txt"
# Ortholog groups
ortholog_group_file <- "Data/orthologs.ORTHO.plantgenie.txt.gz"
# Read in ortholog groups
# =======================
# Reading in and parsing the PLAZA file takes time. Parsed results are saved
# and used if they exists.
ortholog_group_RData <- paste0("RData/orthologs-", species1_name, "-", species2_name, ".RData")
if (!file.exists(ortholog_group_RData)) {
ortho <- read.delim(ortholog_group_file)
ortho <- ortho %>%
filter(species == species1_name) %>%
group_by(gene_content) %>%
mutate(OrthoGroup = cur_group_id()) %>%
ungroup() %>%
separate_rows(gene_content, sep = ";", convert = FALSE) %>%
filter(grepl(species2_name, gene_content)) %>%
separate(gene_content, into = c("prefix", "gene_content"), sep = ":") %>%
separate_rows(gene_content, sep = ",", convert = FALSE) %>%
mutate(Species1 = gene_id, Species2 = gene_content) %>%
select(Species1, Species2, OrthoGroup)
save(ortho, file = ortholog_group_RData)
} else {
load(file = ortholog_group_RData)
}
# Read in expression data
# =======================
species1_expr <- read.delim(species1_expr_file, sep = "\t", header = TRUE)
species2_expr <- read.delim(species2_expr_file, sep = "\t", header = TRUE)
# Filter
# ======
cat (length(unique(ortho$OrthoGroup)), " ortholog groups containing:\n",
" ", length(unique(ortho$Species1)), " ", species1_name, " genes\n",
" ", length(unique(ortho$Species2)), " ", species2_name, " genes\n\n",
length(unique(species1_expr$Genes)), " expressed ", species1_name, " genes\n",
length(unique(species2_expr$Genes)), " expressed ", species2_name, " genes\n",
sep = "")
## 8235 ortholog groups containing:
## 29158 potri genes
## 35245 piabi genes
##
## 28295 expressed potri genes
## 18514 expressed piabi genes
ortho <- ortho %>%
filter(Species1 %in% species1_expr$Genes & Species2 %in% species2_expr$Genes)
species1_expr <- species1_expr[species1_expr$Genes %in% ortho$Species1,]
species2_expr <- species2_expr[species2_expr$Genes %in% ortho$Species2,]
cat ("After filtering on expressed genes with ortholog:\n",
" ", length(unique(ortho$OrthoGroup)), " ortholog groups containing: \n",
" ", length(unique(ortho$Species1)), " ", species1_name, " genes\n",
" ", length(unique(ortho$Species2)), " ", species2_name, " genes\n",
sep = "")
## After filtering on expressed genes with ortholog:
## 6574 ortholog groups containing:
## 19332 potri genes
## 12872 piabi genes
Several parameters:
cor_method <- "pearson" # pearson spearman
cor_sign <- "" # abs
norm_method <- "MR" # CLR MR
density_thr <- 0.03
randomize <- "" # rand
comparison_RData <- paste0("RData/comparison-", species1_name, "-", species2_name, "-",
cor_sign, cor_method, norm_method, density_thr, randomize, ".RData")
if (!file.exists(comparison_RData)) {
if (randomize == "rand") {
species1_expr$Genes <- sample(species1_expr$Genes, nrow(species1_expr), FALSE)
species2_expr$Genes <- sample(species2_expr$Genes, nrow(species2_expr), FALSE)
}
species1_net <- cor(t(species1_expr[,-1]), method = cor_method)
dimnames(species1_net) <- list(species1_expr$Genes, species1_expr$Genes)
species2_net <- cor(t(species2_expr[,-1]), method = cor_method)
dimnames(species2_net) <- list(species2_expr$Genes, species2_expr$Genes)
if (cor_sign == "abs") {
species1_net <- abs(species1_net)
species2_net <- abs(species2_net)
}
if (norm_method == "CLR") {
z <- scale(species1_net)
z[z < 0] <- 0
species1_net <- sqrt(t(z)**2 + z**2)
z <- scale(species2_net)
z[z < 0] <- 0
species2_net <- sqrt(t(z)**2 + z**2)
} else if (norm_method == "MR") {
R <- t(apply(species1_net, 1, rank))
species1_net <- sqrt(R * t(R))
R <- t(apply(species2_net, 1, rank))
species2_net <- sqrt(R * t(R))
}
diag(species1_net) <- 0
diag(species2_net) <- 0
R <- sort(species1_net[upper.tri(species1_net, diag = FALSE)], decreasing = TRUE)
species1_thr <- R[round(density_thr*length(R))]
plot(density(R), xlab = paste0(species1_name, " correlations"), main = "")
R <- sort(species2_net[upper.tri(species2_net, diag = FALSE)], decreasing = TRUE)
species2_thr <- R[round(density_thr*length(R))]
plot(density(R), xlab = paste0(species2_name, " correlations"), main = "")
} else {
load(file = comparison_RData)
}
cat(species1_name, "co-expr threshold", format(species1_thr, digits = 3) , "\n")
## potri co-expr threshold 18546
cat(species2_name, "co-expr threshold", format(species2_thr, digits = 3) , "\n")
## piabi co-expr threshold 12397
For each ortholog pair, p-values for for the overlap of network neighborhoods are computed in both direction.
if (!file.exists(comparison_RData)) {
comparison <- ortho
comparison$Species1.neigh <- c(NA)
comparison$Species1.ortho.neigh <- c(NA)
comparison$Species1.neigh.overlap <- c(NA)
comparison$Species1.p.val <- c(NA)
comparison$Species2.neigh <- c(NA)
comparison$Species2.ortho.neigh <- c(NA)
comparison$Species2.neigh.overlap <- c(NA)
comparison$Species2.p.val <- c(NA)
for (i in 1:nrow(ortho)) {
if (i %% 100 == 0) {
cat(i, "\n")
}
# Species 1 -> Species 2
neigh <- species1_net[ortho$Species1[i],]
neigh <- names(neigh[neigh >= species1_thr])
ortho_neigh <- species2_net[ortho$Species2[i],]
ortho_neigh <- names(ortho_neigh[ortho_neigh >= species2_thr])
ortho_neigh <- unique(ortho$Species1[ortho$Species2 %in% ortho_neigh])
N <- nrow(species1_expr)
m <- length(neigh)
n <- N-m
k <- length(ortho_neigh)
x <- length(intersect(neigh, ortho_neigh))
p_val <- 1
if (x > 1) {
p_val <- phyper(x-1, m, n, k, lower.tail = FALSE)
}
comparison$Species1.neigh[i] <- m
comparison$Species1.ortho.neigh[i] <- k
comparison$Species1.neigh.overlap[i] <- x
comparison$Species1.p.val[i] <- p_val
# Species 2 -> Species 1
neigh <- species2_net[ortho$Species2[i],]
neigh <- names(neigh[neigh >= species2_thr])
ortho_neigh <- species1_net[ortho$Species1[i],]
ortho_neigh <- names(ortho_neigh[ortho_neigh >= species1_thr])
ortho_neigh <- unique(ortho$Species2[ortho$Species1 %in% ortho_neigh])
N <- nrow(species2_expr)
m <- length(neigh)
n <- N-m
k <- length(ortho_neigh)
x <- length(intersect(neigh, ortho_neigh))
p_val <- 1
if (x > 1) {
p_val <- phyper(x-1, m, n, k, lower.tail = FALSE)
}
comparison$Species2.neigh[i] <- m
comparison$Species2.ortho.neigh[i] <- k
comparison$Species2.neigh.overlap[i] <- x
comparison$Species2.p.val[i] <- p_val
}
save(comparison, species1_thr, species2_thr, file = comparison_RData)
}
# Filter orthologs not in the networks
comparison <- comparison %>%
filter(Species1.neigh.overlap > 0 & Species2.neigh.overlap > 0)
# FDR correction
comparison$Species1.p.val <- p.adjust(comparison$Species1.p.val, method = "fdr")
comparison$Species2.p.val <- p.adjust(comparison$Species2.p.val, method = "fdr")
cat ("After filtering on gene pairs in the networks:\n",
" ", length(unique(comparison$OrthoGroup)), " ortholog groups containing: \n",
" ", length(unique(comparison$Species1)), " ", species1_name, " genes\n",
" ", length(unique(comparison$Species2)), " ", species2_name, " genes\n",
sep = "")
## After filtering on gene pairs in the networks:
## 6547 ortholog groups containing:
## 19201 potri genes
## 12835 piabi genes
# Comparsion of p-values of orthologs: species 1 -> species 2 vs species 2 -> species 1
R <- cor.test(-log10(comparison$Species1.p.val), -log10(comparison$Species2.p.val))
data.frame(s1 = -log10(comparison$Species1.p.val),
s2 = -log10(comparison$Species2.p.val)) %>%
ggplot(aes(x = s1, y = s2)) +
xlab(paste0(species1_name, " p-value (-log10)")) +
ylab(paste0(species2_name, " p-value (-log10)")) +
geom_point() +
geom_smooth(method=lm, formula = y ~ x, fill = "gainsboro") +
ggtitle(paste0("Correlation = ", format(R$estimate, digits = 3)))
# Print some summary statistics
# =============================
# Gene pairs
cat("GENE PAIRS:\n",
sum(comparison$Species1.p.val< 0.05), " conserved ", species1_name, " gene pairs (",
format(sum(comparison$Species1.p.val< 0.05)/nrow(comparison), digits = 3), ")\n",
sum(comparison$Species2.p.val< 0.05), " conserved ", species2_name, " gene pairs (",
format(sum(comparison$Species2.p.val< 0.05)/nrow(comparison), digits = 3), ")\n",
sum(comparison$Species1.p.val< 0.05 & comparison$Species2.p.val< 0.05), " reciprocally conserved gene pairs (",
format(sum(comparison$Species1.p.val< 0.05 & comparison$Species2.p.val< 0.05)/nrow(comparison), digits = 3), ")\n", sep = "")
## GENE PAIRS:
## 31968 conserved potri gene pairs (0.366)
## 35765 conserved piabi gene pairs (0.409)
## 25105 reciprocally conserved gene pairs (0.287)
# Genes
comparison_species1 <- comparison %>%
group_by(Species1) %>%
arrange(Species1.p.val) %>%
slice(1)
comparison_species2 <- comparison %>%
group_by(Species2) %>%
arrange(Species2.p.val) %>%
slice(1)
comparison_species1_12 <- comparison %>%
rowwise() %>%
mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
group_by(Species1) %>%
arrange(Max.p.val) %>%
slice(1)
comparison_species2_12 <- comparison %>%
rowwise() %>%
mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
group_by(Species2) %>%
arrange(Max.p.val) %>%
slice(1)
cat("GENES:\n",
sum(comparison_species1$Species1.p.val< 0.05), " conserved ", species1_name, " genes (",
format(sum(comparison_species1$Species1.p.val< 0.05)/nrow(comparison_species1), digits = 3), ")\n",
sum(comparison_species2$Species2.p.val< 0.05), " conserved ", species2_name, " genes (",
format(sum(comparison_species2$Species2.p.val< 0.05)/nrow(comparison_species2), digits = 3), ")\n",
sum(comparison_species1_12$Max.p.val< 0.05), " reciprocally conserved ", species1_name, " genes (",
format(sum(comparison_species1_12$Max.p.val< 0.05 &
comparison_species1_12$Max.p.val< 0.05)/nrow(comparison_species1_12), digits = 3), ")\n",
sum(comparison_species2_12$Max.p.val< 0.05), " reciprocally conserved ", species2_name, " genes (",
format(sum(comparison_species2_12$Max.p.val< 0.05 &
comparison_species2_12$Max.p.val< 0.05)/nrow(comparison_species2_12), digits = 3), ")\n",
sep = "")
## GENES:
## 9528 conserved potri genes (0.496)
## 6981 conserved piabi genes (0.544)
## 8163 reciprocally conserved potri genes (0.425)
## 5858 reciprocally conserved piabi genes (0.456)
# Orthogroups
comparison_species1 <- comparison %>%
group_by(OrthoGroup) %>%
arrange(Species1.p.val) %>%
slice(1)
comparison_species2 <- comparison %>%
group_by(OrthoGroup) %>%
arrange(Species2.p.val) %>%
slice(1)
comparison_species12 <- comparison %>%
rowwise() %>%
mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
group_by(OrthoGroup) %>%
arrange(Max.p.val) %>%
slice(1)
cat("ORTHOLOG GROUPS:\n",
sum(comparison_species1$Species1.p.val< 0.05), " conserved ", species1_name, " orthogroups (",
format(sum(comparison_species1$Species1.p.val< 0.05)/nrow(comparison_species1), digits = 3), ")\n",
sum(comparison_species2$Species2.p.val< 0.05), " conserved ", species2_name, " orthogroups (",
format(sum(comparison_species2$Species2.p.val< 0.05)/nrow(comparison_species2), digits = 3), ")\n",
sum(comparison_species12$Max.p.val< 0.05), " reciprocally conserved orthogroups (",
format(sum(comparison_species12$Max.p.val< 0.05 &
comparison_species12$Max.p.val< 0.05)/nrow(comparison_species12), digits = 3), ")\n", sep = "")
## ORTHOLOG GROUPS:
## 3630 conserved potri orthogroups (0.554)
## 3582 conserved piabi orthogroups (0.547)
## 3013 reciprocally conserved orthogroups (0.46)
Table of all comparisons.
potri_annot <- read.delim("data/Ptrichocarpa_210_v3.0.annotation_info.txt", header = F, sep = "\t")
potri_annot <- potri_annot[, c("V2", "V12", "V13")]
colnames(potri_annot) <- c("Species1", "Symbol", "Description")
potri_annot <- potri_annot %>%
group_by(Species1) %>%
slice(1)
comparison_table <- comparison %>%
rowwise() %>%
mutate(Max.p.val = max(Species1.p.val, Species2.p.val)) %>%
left_join(potri_annot, by = "Species1") %>%
select(-c("Species1.neigh", "Species1.ortho.neigh", "Species2.neigh", "Species2.ortho.neigh")) %>%
arrange(Max.p.val)
comparison_table$Species1.p.val <- format(comparison_table$Species1.p.val, digits = 3, scientific = TRUE)
comparison_table$Species2.p.val <- format(comparison_table$Species2.p.val, digits = 3, scientific = TRUE)
comparison_table$Max.p.val <- format(comparison_table$Max.p.val, digits = 3, scientific = TRUE)
datatable(comparison_table, rownames = FALSE, filter = "top",
options = list(
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)